---
title: "Script"
author: "Maria Knapczyk"
format:
html:
toc: true
toc-location: right
toc-title: Spis Treści
number-sections: true
embed-resources: true
html-math-method: katex
code-tools: true
code-fold: show
code-summary: "Show and hide code"
link-external-icon: true
link-external-newwindow: true
smooth-scroll: true
self-contained: true
fig-align: center
execute:
echo: true
error: false
warning: false
output: true
---
## Intro
```{r}
# Load modelling, data-wrangling and reporting packages used in this script.
library(tidymodels)
library(tidyverse)
library(stringr)
library(vip)
library(pdp)
library(DALEX)
library(DALEXtra)
library(bestNormalize)
library(rules)
library(baguette)
library(finetune)
library(doParallel)
library(DT)
library(ggplot2)
library(lubridate)
library(future)

# Resolve function-name conflicts in favour of tidymodels.
tidymodels_prefer()
```
```{r}
# Read the raw coffee-shop transactions and show an interactive preview table.
data = read.csv ("coffee_shop_sales.csv" )
datatable (data)
```
## Data
```{r}
# Inspect column names and types of the raw data.
str (data)
```
```{r}
# Clean column names and derive time-based features from the raw data.
data <- data |>
  as_tibble() |>
  janitor::clean_names() |>
  mutate(
    transaction_date = as.Date(transaction_date),
    # Combine date + time of day into a single POSIXct timestamp.
    transaction_datetime = as.POSIXct(
      paste(transaction_date, transaction_time),
      format = "%Y-%m-%d %H:%M:%S"
    )
  ) |>
  mutate(
    hour = hour(transaction_datetime)
  ) |>
  mutate(
    # Ordered day-of-week label, kept for plots/tables.
    # NOTE: these labels follow the session locale.
    wday = wday(transaction_date, label = TRUE, abbr = TRUE),
    # FIX: derive the weekend flag from the numeric day of week instead of
    # comparing against English labels ("Sat"/"Sun"), which silently
    # classifies every row as "week" when the session locale is not English.
    # With week_start = 1, Saturday = 6 and Sunday = 7.
    day_work = if_else(
      wday(transaction_date, week_start = 1) >= 6, "weekend", "week"
    ),
    day_work = factor(day_work)
  )
```
```{r}
# Per-column summary statistics.
data |> summary()
```
```{r}
# Count missing values per column.
colSums(is.na(data))
```
```{r}
# Correlation matrix of the numeric columns.
# FIX: use pairwise-complete observations so a single NA (the chunk above
# checks for them) does not turn whole rows/columns of the matrix into NA.
cor(data[sapply(data, is.numeric)], use = "pairwise.complete.obs")
```
## Plots
```{r}
# Distribution of the transaction total.
ggplot(data, aes(x = total_price)) +
  geom_histogram(binwidth = 1, fill = "cadetblue", color = "black") +
  labs(
    title = "Histogram cen",
    x = "Cena",
    y = "Ilość"
  )

# Distribution of loyalty points earned per transaction.
ggplot(data, aes(x = loyalty_points_earned)) +
  geom_histogram(binwidth = 1, fill = "cadetblue", color = "black") +
  labs(
    title = "Histogram zdobytych punktów lojanościowych",
    x = "Przedziały",
    y = "Ilość"
  )

# Transaction counts per product category.
ggplot(data, aes(x = product_category)) +
  geom_bar(fill = "cadetblue", color = "black") +
  labs(
    title = "Liczba transakcji w poszczególnych kategoriach produktów",
    x = "Kategoria produktu",
    y = "Liczba transakcji"
  )

# Transaction counts per payment method.
ggplot(data, aes(x = payment_method)) +
  geom_bar(fill = "cadetblue", color = "black") +
  labs(
    title = "Liczba transakcji poszczególnych metod płatniczych",
    x = "Metoda płatności",
    y = "Liczba transakcji"
  )

# Transaction counts per shop location.
ggplot(data, aes(x = location)) +
  geom_bar(fill = "cadetblue", color = "black") +
  labs(
    title = "Liczba transakcji w poszczególnych lokalizacjach",
    x = "Lokalizacja",
    y = "Liczba transakcji"
  )
```
```{r}
# Top three products by transaction count within every hour of the day.
hourly_products <- data |>
  group_by(hour, product_name) |>
  summarise(transactions = n(), .groups = "drop") |>
  arrange(hour, desc(transactions)) |>
  group_by(hour) |>
  slice_max(transactions, n = 3)

# Stacked counts of those top products per hour.
ggplot(hourly_products, aes(x = factor(hour), y = transactions, fill = product_name)) +
  geom_col() +
  labs(
    title = "Najczęściej kupowany produkt w każdej godzinie",
    x = "Godzina",
    y = "Liczba transakcji",
    fill = "Produkt"
  )
```
## Formula
```{r}
# Model formula: predict the transaction total from all other columns.
# NOTE(review): `forms` does not appear to be used later — the recipes below
# carry their own formulas; consider removing it.
forms <- formula (total_price ~ .)
```
## Data split
```{r}
# Reproducible train/validation/test split stratified by the outcome, plus
# repeated 10-fold CV resamples of the training set. Objects are saved so
# later chunks (and re-runs) can reload them instead of recomputing.
set.seed (111 )
trans_split <- initial_validation_split (data = data, strata = total_price)
trans_train <- training (trans_split)
trans_test <- testing (trans_split)
trans_valid <- validation_set (trans_split)
trans_folds <- vfold_cv (trans_train, strata = total_price, v = 10 , repeats = 5 ) # for testing
save (trans_split, trans_train, trans_test, trans_valid, trans_folds,file = "coffesplitted_data.Rdata" )
```
```{r}
# Restore the split objects saved above.
load ("coffesplitted_data.Rdata" )
```
## Model
```{r}
# Model specifications. Every hyperparameter marked tune() is optimised later.

# CART decision tree
cart_spec <- decision_tree(
  cost_complexity = tune(),
  min_n = tune()
) |>
  set_engine("rpart") |>
  set_mode("regression")

# Random forest
rf_spec <- rand_forest(
  mtry = tune(),
  min_n = tune(),
  trees = tune()
) |>
  set_engine("ranger") |>
  set_mode("regression")

# Cubist rule-based regression
cubist_spec <- cubist_rules(
  committees = tune(),
  neighbors = tune()
) |>
  set_engine("Cubist") |>
  set_mode("regression")

# Gradient-boosted trees
xgb_spec <- boost_tree(
  tree_depth = tune(),
  learn_rate = tune(),
  loss_reduction = tune(),
  min_n = tune(),
  sample_size = tune(),
  trees = tune()
) |>
  set_engine("xgboost") |>
  set_mode("regression")

# Multivariate adaptive regression splines
mars_spec <- mars(
  num_terms = tune(),
  prod_degree = tune()
) |>
  set_engine("earth") |>
  set_mode("regression")

# K-nearest neighbours
knn_spec <- nearest_neighbor(
  neighbors = tune()
) |>
  set_engine("kknn") |>
  set_mode("regression")
```
## Recipe
```{r}
# Recipe for tree-based engines (rpart/ranger): nominal predictors stay factors.
basic_rec <- recipe(total_price ~ ., data = trans_train) |>
  step_mutate(hour = as.numeric(hour)) |>
  # Identifier columns are kept in the data but excluded from the predictors.
  update_role(
    transaction_id, transaction_date, transaction_time,
    product_id, customer_id, barista_id,
    new_role = "ID"
  ) |>
  # Drop columns that leak the outcome (total_price = quantity * unit_price)
  # or are a direct function of it (loyalty points).
  step_rm(quantity, unit_price, loyalty_points_earned)

# Recipe for engines that need numeric predictors (xgboost, mars, knn).
basic_t_rec <- recipe(total_price ~ ., data = trans_train) |>
  update_role(transaction_id, transaction_date, transaction_time,
              product_id, customer_id, barista_id, new_role = "ID") |>
  step_mutate(
    transaction_datetime = as.numeric(transaction_datetime),
    hour = as.numeric(hour)
  ) |>
  step_rm(quantity, unit_price, loyalty_points_earned) |>
  step_unknown(all_nominal_predictors()) |>
  step_dummy(all_nominal_predictors()) |>
  # FIX: after step_dummy() no nominal predictors remain, so the previous
  # step_zv(all_nominal_predictors()) matched nothing; screen ALL predictors
  # so constant dummy columns are actually removed.
  step_zv(all_predictors())

# Recipe for Cubist: factors are fine, but guard against unseen/new levels.
cubist_rec <- recipe(total_price ~ ., data = trans_train) |>
  update_role(transaction_id, transaction_date, transaction_time,
              product_id, customer_id, barista_id, new_role = "ID") |>
  step_mutate(
    transaction_datetime = as.numeric(transaction_datetime),
    hour = as.numeric(hour)
  ) |>
  step_rm(quantity, unit_price, loyalty_points_earned) |>
  step_unknown(all_nominal_predictors()) |>
  step_novel(all_nominal_predictors()) |>
  step_string2factor(all_nominal_predictors()) |>
  step_zv(all_predictors())

# Sanity check: preview the fully processed training data.
prep(basic_t_rec, training = trans_train) |> juice()
```
```{r}
# Preview the training data processed by the tree-model recipe.
prep (basic_rec, training = trans_train) |> juice ()
```
```{r}
# Variable roles (outcome / predictor / ID) of the tree-model recipe.
summary (basic_rec) |> knitr:: kable ()
```
## Workflow
```{r}
# Bundle recipes with model specifications into workflow sets.
a <- workflow_set(
  preproc = list(b = basic_rec),
  models = list(
    rpart = cart_spec,
    ranger = rf_spec
  )
)
b <- workflow_set(
  preproc = list(t = basic_t_rec),
  models = list(xgboost = xgb_spec)
)
c <- workflow_set(
  preproc = list(cubist = cubist_rec),
  models = list(cubist = cubist_spec)
)
d <- workflow_set(
  # FIX: mars (earth) and knn (kknn) need numeric predictors, so the
  # "transformed" preprocessor must be the dummy-encoded recipe — this also
  # matches the workflow set actually tuned in the grid-search chunk below.
  preproc = list(transformed = basic_t_rec),
  models = list(
    mars = mars_spec,
    knn = knn_spec
  )
)
basic <- bind_rows(a, b, c, d)
# Strip the two-character "b_"/"t_" prefixes from the workflow ids.
# NOTE(review): for the longer preprocessor names this mangles the id
# ("cubist_cubist" -> "bist_cubist", "transformed_mars" -> "ansformed_mars").
# The rest of the script refers to "bist_cubist", so this is kept as-is.
basic$wflow_id <- str_sub(basic$wflow_id, start = 3, end = 100)
basic
```
## Parameters
```{r}
# Hyperparameter search spaces (n = number of rows, p = number of predictors).
cart_param <- cart_spec |>
  extract_parameter_set_dials() |>
  update(
    min_n = min_n(c(5, 30)) # < 1% of n
  )
rf_param <- rf_spec |>
  extract_parameter_set_dials() |>
  update(
    min_n = min_n(c(5, 30)),
    mtry = mtry(c(4, 9)), # roughly sqrt(p) up to p / 2
    trees = trees(c(100, 500))
  )
xgb_param <- xgb_spec |>
  extract_parameter_set_dials() |>
  update(
    min_n = min_n(c(5, 30)),
    trees = trees(c(100, 500)),
    tree_depth = tree_depth(c(2, 10)),
    # FIX: learn_rate() and loss_reduction() take their ranges on the log10
    # scale by default, so the previous c(0.01, 0.3) / c(0, 5) actually meant
    # learning rates of ~1.02-2.0 and penalties of 1-100000. Pass log10
    # bounds for the intended values instead.
    learn_rate = learn_rate(c(-2, -0.5)),         # 10^-2 .. 10^-0.5 ~ 0.01-0.32
    loss_reduction = loss_reduction(c(-10, 0.7)), # up to 10^0.7 ~ 5
    sample_size = sample_prop(c(0.5, 1))
  )
cubist_param <- cubist_spec |>
  extract_parameter_set_dials() |>
  update(
    committees = committees(c(1, 10)),
    neighbors = neighbors(c(0, 5))
  )
mars_param <- mars_spec |>
  extract_parameter_set_dials() |>
  update(
    num_terms = num_terms(c(2, 30)),
    prod_degree = prod_degree(c(1, 2))
  )
knn_param <- knn_spec |>
  extract_parameter_set_dials() |>
  update(
    neighbors = neighbors(c(3, 15))
  )

# Attach each search space to its workflow.
# FIX: option_add() matches on the *current* wflow_id, which after the
# prefix-stripping above is "bist_cubist" / "ansformed_mars" /
# "ansformed_knn". The short ids "cubist" / "mars" / "knn" matched nothing,
# so those custom search spaces were never attached (the cubist tuning plots
# showing committees up to ~100, the dials default, confirm this).
basic <- basic |> option_add(param_info = cart_param, id = "rpart")
basic <- basic |> option_add(param_info = rf_param, id = "ranger")
basic <- basic |> option_add(param_info = xgb_param, id = "xgboost")
basic <- basic |> option_add(param_info = cubist_param, id = "bist_cubist")
basic <- basic |> option_add(param_info = mars_param, id = "ansformed_mars")
basic <- basic |> option_add(param_info = knn_param, id = "ansformed_knn")
basic
```
```{r}
# List the tuning-parameter objects attached to every workflow
# (uses the native-pipe `_` placeholder to extract the `object` column).
basic |>
split (~ wflow_id) |>
map (
\(x) extract_parameter_set_dials (
x = x,
id = x$ wflow_id
) |>
_$ object
)
```
## Grid and tune
```{r}
# Control options for ANOVA racing: keep predictions, parallelise everything,
# drop fitted workflows (saves memory), report progress.
race_ctrl <- control_race (
save_pred = TRUE ,
parallel_over = "everything" ,
save_workflow = FALSE ,
verbose = TRUE
)
```
```{r}
# Start a PSOCK cluster on all physical cores and register it for foreach;
# the cluster is stopped in the grid-search chunk below.
cores <- parallel:: detectCores (logical = FALSE )
cl <- makePSOCKcluster (cores)
registerDoParallel (cl)
# Race-tune only the tree/rule-based workflows ("bist_cubist" is the
# mangled id of the cubist workflow — see the workflow-set chunk).
race_models <- basic |>
filter (wflow_id %in% c ("rpart" , "ranger" , "bist_cubist" , "xgboost" ))
# Time the racing run.
time_race <- Sys.time ()
race_result <- workflow_map (
race_models,
"tune_race_anova" ,
seed = 111 ,
resamples = trans_folds,
grid = 50 ,
control = race_ctrl,
verbose = TRUE ,
metrics = metric_set (rmse, mae, rsq)
)
Sys.time () - time_race
```
```{r}
# Plain grid search (no racing) for the remaining models.
grid_ctrl <- control_grid (
save_pred = TRUE ,
parallel_over = "everything"
)
# The workflow set already exists; it is rebuilt here with the
# dummy-encoded recipe for the mars/knn models.
grid_models <- workflow_set (
preproc = list (transformed = basic_t_rec),
models = list (
mars = mars_spec,
knn = knn_spec
)
)
# Tune all workflows at once.
grid_result <- workflow_map (
grid_models,
"tune_grid" ,
resamples = trans_folds,
grid = 20 ,
metrics = metric_set (rmse, mae, rsq),
control = grid_ctrl
)
# Shut down the parallel cluster started in the racing chunk.
stopCluster (cl)
save (race_result, grid_result, file = "tune_results_split.Rdata" )
```
## Best model selection
```{r}
load("tune_results_split.Rdata")

# Combine racing and grid-search results into one workflow-set object.
combined_results <- bind_rows(race_result, grid_result)

# Best hyperparameter combination (by RMSE) for every workflow.
best_results <-
  combined_results |>
  split(~ wflow_id) |>
  map(
    \(x)
    extract_workflow_set_result(x = x, id = x$wflow_id) |>
      # FIX: removed a stray trailing comma that passed an empty argument
      # into select_best()'s dots.
      select_best(metric = "rmse")
  )

# Finalise every workflow with its best parameters, fit on the training
# data and evaluate once on the test set.
best_models <-
  combined_results |>
  split(~ wflow_id) |>
  map(
    \(x)
    extract_workflow(x = x, id = x$wflow_id) |>
      finalize_workflow(best_results[[x$wflow_id]]) |>
      last_fit(
        split = trans_split,
        metrics = metric_set(rmse, rsq, mae)
      )
  )
save(best_models, file = "best_models.rdata")
```
```{r}
# FIX: the tuning results were saved as "tune_results_split.Rdata" above;
# "tune_result.Rdata" was never created, so load() would error here.
load("tune_results_split.Rdata")
load("best_models.rdata")
```
## Result tune
```{r}
# One-row summary (best RMSE configuration) per workflow.
combined_results |>
  split(~ wflow_id) |>
  map(
    \(x)
    extract_workflow_set_result(x = x, id = x$wflow_id) |>
      show_best(metric = "rmse", n = 1) |>
      select(-n, -.metric, -.config)
  ) |>
  knitr::kable()
```
```{r}
# Wide table of validation metrics (value/std-err) per workflow, best first.
combined_results |>
  # FIX: TRUE instead of the reassignable alias T.
  rank_results(select_best = TRUE) |>
  # Glue mean and std_err together so pivot_wider keeps them paired...
  unite("rate", c("mean", "std_err"), sep = "/") |>
  pivot_wider(names_from = .metric, values_from = rate) |>
  # ...then split them back into <metric> and <metric>_std_err columns.
  separate_wider_delim(
    cols = mae:rsq,
    delim = "/",
    names = c("", "_std_err"),
    names_sep = "") |>
  select(-preprocessor, -n, -model) |>
  # Keep only the informative tail of the configuration label.
  mutate(.config = str_sub(.config, 20, 30)) |>
  mutate(across(
    .cols = mae:rsq_std_err,
    .fns = \(x) signif(x = as.numeric(x), digits = 3)
  )) |> arrange(mae) |>
  gt::gt() |>
  gt::tab_header(title = "Wyniki oceny dla zestawu walidacyjnego")
```
```{r}
# Validation metrics with 95% CI error bars, one panel per metric,
# models ordered by overall rank.
combined_results |>
  # FIX: TRUE instead of the reassignable alias T.
  rank_results(select_best = TRUE) |>
  mutate(wflow_id = fct_reorder(wflow_id, rank)) |>
  ggplot(aes(wflow_id, mean, colour = wflow_id)) +
  geom_point() +
  geom_errorbar(aes(ymin = mean - 1.96 * std_err,
                    ymax = mean + 1.96 * std_err), width = 0.8) +
  facet_wrap(~ .metric, scales = "free_y") +
  theme_bw() +
  labs(x = "model",
       y = "value metric",
       colour = "model") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
```
```{r}
# Saves the most recently displayed plot (the one above); relies on chunk
# order, so keep this chunk right after the plot.
ggsave(filename = "rys. 3._valid_result_model.jpeg", device = "jpeg",
       width = 8.5, height = 3, dpi = 300)
```
```{r}
# Per-candidate resampling metrics for every workflow, keyed by wflow_id.
# The native-pipe `_` placeholder pulls the first .metrics tibble out of the
# tuning result; .config is truncated to a short label for plotting.
all_fit_metrics <-
combined_results |>
split (~ wflow_id) |>
map (
\(x)
extract_workflow_set_result (x = x, id = x$ wflow_id) |>
_$ .metrics |>
_[[1 ]] |>
mutate (.config = str_sub (.config, 20 , 24 ))
)
```
### Ranger
```{r}
# Best ranger hyperparameters for each of the three metrics.
# FIX: map_dfr(.id = ".metric") over an *unnamed* vector labels the rows
# "1","2","3" instead of the metric names; tag each row explicitly (same
# approach as the rpart and xgboost tables).
c("mae", "rmse", "rsq") |>
  map_dfr(
    \(x)
    combined_results |>
      extract_workflow_set_result(id = "ranger") |>
      select_best(metric = x) |>
      mutate(.metric = x)) |>
  gt::gt()
```
```{r}
# RMSE of every tuning candidate: trees vs estimate, coloured by min_n,
# one panel per mtry. (Swap the model name / metric to explore others.)
all_fit_metrics[["ranger"]] |>
  filter(.metric == "rmse") |>
  ggplot(aes(trees, .estimate, color = factor(min_n))) +
  geom_point() +
  facet_wrap(~ mtry) +
  scale_x_continuous(limits = c(0, 600), breaks = seq(0, 600, 100)) +
  ggtitle(label = "ranger") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
```
Najlepszy mtry = 4
300–400 drzew jest wystarczające
Najniższe RMSE dla sporej liczby min_n - koło 25, drzewa płytsze i zgeneralizowane
```{r}
# Ranger candidates with RMSE below 2.74, best first.
all_fit_metrics[["ranger"]] |>
  filter(.metric == "rmse", .estimate < 2.74) |>
  arrange(.estimate) |>
  gt::gt()
```
Najlepsze parametry z config 07
### Cubist
```{r}
# Best cubist hyperparameters for each of the three metrics
# ("bist_cubist" is the mangled workflow id — see the workflow-set chunk).
# FIX: map_dfr(.id = ".metric") over an *unnamed* vector labels the rows
# "1","2","3" instead of the metric names; tag each row explicitly.
c("mae", "rmse", "rsq") |>
  map_dfr(
    \(x)
    combined_results |>
      extract_workflow_set_result(id = "bist_cubist") |>
      select_best(metric = x) |>
      mutate(.metric = x)) |>
  gt::gt()
```
```{r}
# RMSE of every cubist candidate: committees vs estimate, one panel per
# neighbors value. (Swap the model name / metric to explore others.)
all_fit_metrics[["bist_cubist"]] |>
  filter(.metric == "rmse") |>
  ggplot(aes(committees, .estimate, color = factor(neighbors))) +
  geom_point(position = position_jitter(width = 0.5, height = 0)) +
  facet_wrap(~ neighbors, scales = "free_y") +
  scale_x_continuous(limits = c(0, 120), breaks = seq(0, 120, 20), expand = c(0, 0)) +
  ggtitle(label = "Cubist") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
```
Najlepiej wypada neighbors = 0; dla większych wartości RMSE maleje wraz z ich zwiększaniem.
```{r}
# Zoom in on neighbors 7-9 to see the committees effect more clearly.
all_fit_metrics[["bist_cubist" ]] |> # change the model here ...
filter (.metric == "rmse" ) |> # change the metric: rsq, mae
filter (between (neighbors, 7 , 9 )) |>
ggplot (aes (committees, .estimate, color = factor (neighbors))) +
geom_point () +
geom_line () +
scale_x_continuous (limits = c (0 , 120 ), breaks = seq (0 , 120 , 20 ), expand = c (0 , 0 )) +
ggtitle (label = "Cubist" )
```
```{r}
# Cubist candidates with RMSE below 2.86, best first.
all_fit_metrics[["bist_cubist" ]] |>
filter (.metric == "rmse" , .estimate < 2.86 ) |>
arrange (.estimate) |>
gt:: gt ()
```
Dla małych neighbors dodatkowe komisje nie mają sensu, model jest już optymalny.
Dla większych neighbors dodanie komisji nieznacznie zmienia RMSE, efekt jest minimalny.
### Rpart
```{r}
# Best rpart hyperparameters for each of the three metrics.
c("mae", "rmse", "rsq") |>
  map_dfr(
    \(x)
    combined_results |>
      extract_workflow_set_result(id = "rpart") |>
      select_best(metric = x) |>
      mutate(.metric = x)) |>
  gt::gt()
```
```{r}
# RMSE of every rpart candidate vs cost_complexity (log10 x-axis).
all_fit_metrics[["rpart" ]] |> # change the model here ...
filter (.metric == "rmse" ) |> # change the metric: rsq, mae
ggplot (aes (cost_complexity, .estimate, color = factor (min_n))) +
geom_point () +
geom_line () +
# facet_wrap(~min_n) +
scale_x_log10 (breaks = breaks_log (n = 10 , base = 10 ), labels = label_log (base = 10 )) +
labs (title = "rpart" , color = "min_n" , y = "rmse" )
```
```{r}
# Three best rpart candidates by RMSE.
all_fit_metrics[["rpart" ]] |>
filter (.metric == "rmse" ) |>
slice_min (.estimate, n = 3 )
```
### xgboost
```{r}
# Best xgboost hyperparameters for each of the three metrics.
c ("mae" , "rmse" , "rsq" ) |>
map_dfr (
\(x)
combined_results |>
extract_workflow_set_result (id = "xgboost" ) |>
select_best (metric = x) |>
mutate (.metric = x)) |>
gt:: gt ()
```
```{r}
# RMSE of every xgboost candidate: trees vs estimate, one panel per depth.
all_fit_metrics[["xgboost" ]] |> # change the model here ...
filter (.metric == "rmse" ) |> # change the metric: rsq, mae
ggplot (aes (trees, .estimate, color = factor (min_n))) +
geom_point () +
facet_wrap (~ tree_depth, scales = "free_y" ) +
labs (title = "xgboost" , color = "min_n" , y = "rmse" )
```
```{r}
# Same view restricted to the strongest candidates (RMSE below 2.8).
all_fit_metrics[["xgboost" ]] |> # change the model here ...
filter (.metric == "rmse" , .estimate < 2.8 ) |> # change the metric: rsq, mae
ggplot (aes (trees, .estimate, color = factor (min_n))) +
geom_point () +
facet_wrap (~ tree_depth) +
labs (title = "xgboost" , color = "min_n" , y = "rmse" )
```
min_n 9 oraz depth 5
### Transformed mars
```{r}
# Best MARS hyperparameters for each of the three metrics.
# FIX: map_dfr(.id = ".metric") over an *unnamed* vector labels the rows
# "1","2","3" instead of the metric names; tag each row explicitly.
c("mae", "rmse", "rsq") |>
  map_dfr(
    \(x)
    combined_results |>
      extract_workflow_set_result(id = "transformed_mars") |>
      select_best(metric = x) |>
      mutate(.metric = x)) |>
  gt::gt()
```
```{r}
# RMSE of every MARS candidate: num_terms vs estimate, one panel per degree.
all_fit_metrics[["transformed_mars"]] |>
  filter(.metric == "rmse") |>
  ggplot(aes(num_terms, .estimate, color = factor(prod_degree))) +
  geom_point() +
  geom_line() +
  facet_wrap(~ prod_degree, scales = "free_y") +
  ggtitle(label = "transformed_mars") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
```
minimalne różnice
### knn
```{r}
# Best knn hyperparameters for each of the three metrics.
# FIX: map_dfr(.id = ".metric") over an *unnamed* vector labels the rows
# "1","2","3" instead of the metric names; tag each row explicitly.
c("mae", "rmse", "rsq") |>
  map_dfr(
    \(x)
    combined_results |>
      extract_workflow_set_result(id = "transformed_knn") |>
      select_best(metric = x) |>
      mutate(.metric = x)) |>
  gt::gt()
```
```{r}
# Columns available in the knn tuning metrics.
colnames (all_fit_metrics[["transformed_knn" ]])
```
```{r}
# RMSE of every knn candidate vs number of neighbours.
all_fit_metrics[["transformed_knn" ]] |>
filter (.metric == "rmse" ) |>
ggplot (aes (neighbors, .estimate, color = factor (neighbors))) +
geom_point () +
geom_line (aes (group = 1 )) +
ggtitle (label = "transformed_knn" ) +
theme (axis.text.x = element_text (angle = 45 , hjust = 1 ))
```
RMSE maleje wraz ze wzrostem liczby sąsiadów.
```{r}
# knn candidates with RMSE below 2.87, best first.
all_fit_metrics[["transformed_knn"]] |>
  filter(.metric == "rmse", .estimate < 2.87) |>
  arrange(.estimate) |>
  gt::gt()
```
Każda z opcji jest dobra - małe różnice
## Result best models
```{r}
# Test-set metrics of the finalised models, one panel per metric.
p <-
best_models |>
map_dfr (.f = \(x) collect_metrics (x = x), .id = "mod" ) |>
mutate (mod = factor (mod, levels = c ("rpart" , "ranger" , "bist_cubist" , "xgboost" , "transformed_mars" , "transformed_knn" ))) |>
ggplot (aes (mod, .estimate, colour = mod)) +
geom_point () +
facet_wrap (~ .metric, scales = "free_y" ) +
theme_bw () +
labs (
x = "Model" ,
y = "Wartosc metryki" ,
colour = "Model"
) +
theme (axis.text.x = element_text (angle = 45 , hjust = 1 ))
p
```
```{r}
# Saves the most recently displayed plot (the one above).
ggsave (
filename = "rys_4_test_result_model.jpeg" , device = "jpeg" ,
width = 8.5 , height = 3 , dpi = 300
)
```
Testowy vs walidacyjny
```{r}
# Compare test-set metrics (from last_fit) with the validation/resampling
# means, side by side per model and metric.
por_test_valid <-
  bind_rows(
    best_models |>
      map_dfr(\(x) collect_metrics(x = x), .id = "mod") |>
      select(mod, .metric, .estimate) |>
      rename(mean = .estimate) |>
      mutate(typ = "testowy"),
    combined_results |>
      # FIX: TRUE instead of the reassignable alias T.
      rank_results(select_best = TRUE) |>
      select(wflow_id, .metric, mean) |>
      rename(mod = wflow_id) |>
      mutate(typ = "walidacyjny")
  ) |>
  pivot_wider(
    names_from = typ,
    values_from = mean
  )
por_test_valid |>
  gt::gt() |>
  gt::fmt_number(n_sigfig = 3)
```
```{r}
# Test vs validation RMSE per model; points on the identity line indicate
# test performance matching the validation estimate.
por_test_valid |>
filter (.metric == "rmse" ) |>
ggplot (aes (walidacyjny, testowy, colour = mod)) +
geom_point (size = 4 ) +
theme_bw () +
geom_abline (slope = 1 ) +
labs (
x = "Walidacyjny" ,
y = "Testowy" ,
colour = "Model"
) +
coord_fixed (ratio = 1 ) +
xlim (2.5 , 3 ) + ylim (2.5 , 3 )
```
```{r}
# Same comparison for R-squared.
por_test_valid |>
filter (.metric == "rsq" ) |>
ggplot (aes (walidacyjny, testowy, colour = mod)) +
geom_point (size = 4 ) +
theme_bw () +
geom_abline (slope = 1 ) +
labs (
x = "Walidacyjny" ,
y = "Testowy" ,
colour = "Model"
) +
coord_fixed (ratio = 1 ) +
expand_limits (x = c (0 ,0.25 ), y = c (0 ,0.25 )) +
scale_x_continuous (expand = c (0 ,0 ), breaks = seq (0 , 0.25 , 0.05 )) +
scale_y_continuous (expand = c (0 ,0 ), breaks = seq (0 , 0.25 , 0.05 ))
```
```{r}
# Same comparison for MAE.
por_test_valid |>
filter (.metric == "mae" ) |>
ggplot (aes (walidacyjny, testowy, colour = mod)) +
geom_point (size = 4 ) +
theme_bw () +
geom_abline (slope = 1 ) +
labs (
x = "Walidacyjny" ,
y = "Testowy" ,
colour = "Model"
) +
coord_fixed (ratio = 1 ) +
expand_limits (x = c (0 ,3 ), y = c (0 ,3 )) +
scale_x_continuous (expand = c (0 ,0 ), breaks = seq (0 , 3 , 0.5 )) +
scale_y_continuous (expand = c (0 ,0 ), breaks = seq (0 , 3 , 0.5 ))
```
Wykres rozrzutu
```{r}
# Observed vs predicted on the test set, one hex-bin panel per model.
my_breaks <- c(1, 4, 9, 16, 25, 36, 49, 64, 81, 100)

# Common axis limit: largest observed or predicted price across all models.
range_axis_x_y <-
  best_models |>
  map_dfr(
    \(x) augment(x = x),
    .id = "mod"
  ) |>
  summarise(max = max(max(total_price), max(.pred))) |>
  pull()

best_models |>
  map_dfr(
    \(x) augment(x = x),
    .id = "mod"
  ) |>
  ggplot(aes(total_price, .pred)) +
  geom_hex() +
  facet_wrap(~ mod) +
  # Reference lines at y = 0.5x, y = x and y = 1.5x.
  # FIX: geom_abline() draws every supplied slope in every panel, so the
  # previous rep(c(1.5, 0.5, 1), 4) just drew the same three lines four
  # times on top of each other.
  geom_abline(slope = c(0.5, 1, 1.5)) +
  theme_bw() +
  viridis::scale_fill_viridis(
    breaks = my_breaks,
    labels = my_breaks,
    option = "D",
    direction = 1,
    trans = scales::pseudo_log_trans(sigma = 1)
  ) +
  scale_x_continuous(expand = c(0, 0), limits = c(0, range_axis_x_y)) +
  scale_y_continuous(expand = c(0, 0), limits = c(0, range_axis_x_y)) +
  coord_obs_pred() +
  labs(
    x = expression("Cena całkowita"),
    y = expression("Cena całkowita"),
    fill = "Czestosc", title = "Ocena na zestawie testowym"
  )
```
```{r}
# Saves the most recently displayed plot (the hex-bin panels above).
ggsave (
filename = "fig_2_Best_models_for_test_data.jpeg" ,
device = "jpeg" ,
width = 8.00 ,
height = 4.60 ,
dpi = 300
)
```
```{r}
# Model-evaluation statistics (FAC2, r, IOA, ...) on the test predictions,
# computed with openair::modStats, one row per model.
best_models |>
map_dfr (
\(x) augment (x = x),
.id = "mod"
) |>
openair:: modStats (mod = ".pred" , obs = "total_price" , type = "mod" ) |>
arrange (FAC2) |>
select (- P) |>
gt:: gt () |>
gt:: fmt_number (columns = FAC2: IOA, n_sigfig = 3 )
```
Najniższe RMSE: cubist
Najwyższe r: cubist
Najwyższe IOA: cubist ( Index of Agreement based on Willmott et al. (2011), which spans between -1 and +1 with values approaching +1 representing better model performance)
Najwyższe FAC2: dla cubist wyjątkowo dobre oszacowanie, bliskie 1.
Cubist wypadł najlepiej, modele drzew również dobrze sobie radzą, modele transformacyjne działają słabiej.
Top3: cubist, rpart, xgboost.